Doublechecking net values

Some people have been reporting problems in the value of net_value column in the CEAP datasets. It doesn't seem to really match what it should contain.

In [1]:
import pandas as pd
import numpy as np

filenames = ['../data/2016-08-08-current-year.xz',
             '../data/2016-08-08-last-year.xz',
             '../data/2016-08-08-previous-years.xz']
dataset = pd.DataFrame()

for filename in filenames:
    data = pd.read_csv(filename,
                       parse_dates=[16],
                       dtype={'document_id': np.str,
                              'congressperson_id': np.str,
                              'congressperson_document': np.str,
                              'term_id': np.str,
                              'cnpj_cpf': np.str,
                              'reimbursement_number': np.str})
    dataset = pd.concat([dataset, data])
In [2]:
dataset['issue_date'] = pd.to_datetime(dataset['issue_date'], errors='coerce')
In [3]:
(dataset['document_value'].isnull()).sum()
Out[3]:
17
In [4]:
dataset[dataset['document_value'].isnull()]
Out[4]:
document_id congressperson_name congressperson_id congressperson_document term state party term_id subquota_number subquota_description ... net_value month year installment passenger leg_of_the_trip batch_number reimbursement_number reimbursement_value applicant_id
72863 5021378 ALMEIDA LIMA 160626 173 2011.0 SE PMDB 54 5 Publicity of parliamentary activity ... 10000.00 10 2012 0 NaN NaN 1005279 4159 NaN 2370
72864 5021389 ALMEIDA LIMA 160626 173 2011.0 SE PMDB 54 5 Publicity of parliamentary activity ... 11300.00 10 2012 0 NaN NaN 1005279 4159 NaN 2370
243933 5028079 CARLOS ZARATTINI 141398 398 2015.0 SP PT 55 10 Telecommunication ... 29.90 2 2013 0 NaN NaN 1007122 4189 NaN 1963
286036 5028084 DANIEL ALMEIDA 74060 188 2015.0 BA PCdoB 55 3 Fuels and lubricants ... 3113.38 1 2013 0 NaN NaN 1006976 4171 NaN 1562
422966 5062354 ELCIONE BARBALHO 74075 21 2015.0 PA PMDB 55 13 Congressperson meal ... 26.87 3 2013 0 NaN NaN 1016826 4238 NaN 1011
513864 5073020 FRANCISCO ESCÓRCIO 137983 567 2011.0 MA PMDB 54 3 Fuels and lubricants ... 150.00 3 2013 0 NaN NaN 1020110 4250 NaN 1762
652869 5063133 JANETE ROCHA PIETÁ 141455 358 2011.0 SP PT 54 15 Aircraft renting or charter of aircraft ... 15.00 3 2013 0 NaN NaN 1018289 4248 NaN 1852
670406 5020275 JESUS RODRIGUES 160671 115 2011.0 PI PT 54 4 Consultancy, research and technical work ... 7000.00 2 2013 0 NaN NaN 1004983 4185 NaN 2380
1031812 5069601 MISSIONÁRIO JOSÉ OLIMPIO 160561 375 2015.0 SP DEM 55 3 Fuels and lubricants ... 130.01 2 2013 0 NaN NaN 1018899 4247 NaN 2388
1095064 5009532 ONYX LORENZONI 74399 510 2015.0 RS DEM 55 3 Fuels and lubricants ... 2755.87 1 2013 0 NaN NaN 1002170 4139 NaN 1627
1118504 5030571 PADRE JOÃO 160556 259 2015.0 MG PT 55 3 Fuels and lubricants ... 137.20 2 2013 0 NaN NaN 1009721 4200 NaN 2305
1180554 5016649 PEDRO HENRY 74111 403 2011.0 MT PP 54 9 Flight tickets ... 486.95 1 2013 0 NaN Cuiabá/Brasilia 1003807 4187 NaN 1227
1212495 5020247 RAIMUNDO GOMES DE MATOS 74216 102 2015.0 CE PSDB 55 1 Participation in course, talk or similar event ... 223.30 1 2013 0 NaN NaN 1005054 4187 NaN 1244
1213738 5020251 RAIMUNDO GOMES DE MATOS 74216 102 2015.0 CE PSDB 55 8 Security service provided by specialized company ... 140.00 1 2013 0 NaN NaN 1005054 4187 NaN 1244
1214309 5020523 RAIMUNDO GOMES DE MATOS 74216 102 2015.0 CE PSDB 55 10 Telecommunication ... 181.84 1 2013 0 NaN NaN 1005053 4187 NaN 1244
1214331 5020531 RAIMUNDO GOMES DE MATOS 74216 102 2015.0 CE PSDB 55 10 Telecommunication ... 155.87 1 2013 0 NaN NaN 1005053 4187 NaN 1244
1214332 5020430 RAIMUNDO GOMES DE MATOS 74216 102 2015.0 CE PSDB 55 10 Telecommunication ... 4.70 1 2013 0 NaN NaN 1005053 4187 NaN 1244

17 rows × 29 columns

In [5]:
dataset[dataset['document_value'].isnull()].iloc[0]
Out[5]:
document_id                                               5021378
congressperson_name                                  ALMEIDA LIMA
congressperson_id                                          160626
congressperson_document                                       173
term                                                         2011
state                                                          SE
party                                                        PMDB
term_id                                                        54
subquota_number                                                 5
subquota_description          Publicity of parliamentary activity
subquota_group_id                                               0
subquota_group_description                                    NaN
supplier                                        JULIANO AMADEU ME
cnpj_cpf                                           11901432000190
document_number                                          00000007
document_type                                                   0
issue_date                                    2012-08-20 00:00:00
document_value                                                NaN
remark_value                                                 5000
net_value                                                   10000
month                                                          10
year                                                         2012
installment                                                     0
passenger                                                     NaN
leg_of_the_trip                                               NaN
batch_number                                              1005279
reimbursement_number                                         4159
reimbursement_value                                           NaN
applicant_id                                                 2370
Name: 72863, dtype: object
In [6]:
import math

dataset = dataset.dropna(subset=['document_value'])
dataset['document_value_int'] = (dataset['document_value'] * 100.).apply(math.ceil).astype(np.int)
dataset['remark_value_int'] = (dataset['remark_value'] * 100.).apply(math.ceil).astype(np.int)
dataset['net_value_int'] = (dataset['net_value'] * 100.).apply(math.ceil).astype(np.int)
dataset['calc_net_value_int'] = dataset['document_value_int'] - dataset['remark_value_int']
In [7]:
((dataset['calc_net_value_int'] - dataset['net_value_int']) != 0).sum()
Out[7]:
21434
In [8]:
dataset.iloc[0]
Out[8]:
document_id                                                             5928744
congressperson_name                                           ABEL MESQUITA JR.
congressperson_id                                                        178957
congressperson_document                                                       1
term                                                                       2015
state                                                                        RR
party                                                                       DEM
term_id                                                                      55
subquota_number                                                               1
subquota_description          Maintenance of office supporting parliamentary...
subquota_group_id                                                             0
subquota_group_description                                                  NaN
supplier                                COMPANHIA DE AGUAS E ESGOTOS DE RORAIMA
cnpj_cpf                                                         05939467000115
document_number                                                          146439
document_type                                                                 0
issue_date                                                  2016-02-15 00:00:00
document_value                                                            37.37
remark_value                                                                  6
net_value                                                                 31.37
month                                                                         1
year                                                                       2016
installment                                                                   0
passenger                                                                   NaN
leg_of_the_trip                                                             NaN
batch_number                                                            1268870
reimbursement_number                                                       5369
reimbursement_value                                                         NaN
applicant_id                                                               3074
document_value_int                                                         3737
remark_value_int                                                            600
net_value_int                                                              3137
calc_net_value_int                                                         3137
Name: 0, dtype: object
In [9]:
dataset['diff_net_value'] = dataset['calc_net_value_int'] - dataset['net_value_int']
dataset.loc[dataset['diff_net_value'] != 0, 'diff_net_value'].describe()
Out[9]:
count    2.143400e+04
mean     6.495887e+04
std      3.402186e+05
min     -1.763608e+06
25%     -1.000000e+00
50%     -1.000000e+00
75%      1.086200e+04
max      9.712858e+06
Name: diff_net_value, dtype: float64
In [10]:
with_significant_difference = dataset.loc[dataset['diff_net_value'].abs() > 2]
In [11]:
with_significant_difference['subquota_description'].describe()
Out[11]:
count                    9457
unique                     17
top       Flight ticket issue
freq                     4319
Name: subquota_description, dtype: object
In [12]:
from altair import *

Chart(with_significant_difference).mark_bar().encode(
    x=X('subquota_description:O',
        sort=SortField(field='subquota_description',
                       order='descending',
                       op='count')),
    y='count(*):Q',
)
In [13]:
with_significant_difference.iloc[0]
Out[13]:
document_id                                                            5914504
congressperson_name                                          ABEL MESQUITA JR.
congressperson_id                                                       178957
congressperson_document                                                      1
term                                                                      2015
state                                                                       RR
party                                                                      DEM
term_id                                                                     55
subquota_number                                                              8
subquota_description          Security service provided by specialized company
subquota_group_id                                                            0
subquota_group_description                                                 NaN
supplier                                                 VERONICA M.B.DA SILVA
cnpj_cpf                                                        08319201000168
document_number                                                         000292
document_type                                                                0
issue_date                                                 2016-01-20 00:00:00
document_value                                                            8500
remark_value                                                                 0
net_value                                                                  200
month                                                                        1
year                                                                      2016
installment                                                                  0
passenger                                                                  NaN
leg_of_the_trip                                                            NaN
batch_number                                                           1264155
reimbursement_number                                                      5381
reimbursement_value                                                        NaN
applicant_id                                                              3074
document_value_int                                                      850000
remark_value_int                                                             0
net_value_int                                                            20000
calc_net_value_int                                                      850000
diff_net_value                                                          830000
Name: 27, dtype: object
In [ ]: